Chapter 5 Community composition

5.1 Gut microbiota

load("data/gut/data.Rdata")

5.1.1 Taxonomy overview

5.1.1.1 Phylum level

genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS nornalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
  left_join(., sample_metadata, by = join_by(sample == sample)) %>% #append sample metadata
  filter(count > 0) %>% #filter 0 counts
  ggplot(., aes(x=sample,y=count, fill=phylum, group=phylum)) + #grouping enables keeping the same sorting of taxonomic units
    geom_bar(stat="identity", colour="white", linewidth=0.1) + #plot stacked bars with white borders
    scale_fill_manual(values=phylum_colors) +
    facet_nested(~factor(environment, labels=c("low" = "Low altitude", "high" = "High altitude")),  scales="free") + #facet per day and treatment
    guides(fill = guide_legend(ncol = 1)) +
    theme(axis.text.x = element_blank(),
          axis.ticks.x = element_blank(),
          axis.title.x = element_blank(),
          panel.background = element_blank(),
          panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          strip.background = element_rect(fill = "white"),
          strip.text = element_text(size = 12, lineheight = 0.6,face="bold"),
          axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black")) +
   labs(fill="Phylum",y = "Relative abundance",x="Samples")

Number of MAGs

539

Number of bacteria phyla

13

Number of Archaea phyla

0

Phylum relative abundances

phylum_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>%
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>%
  left_join(sample_metadata, by = join_by(sample == sample)) %>%
  left_join(genome_metadata, by = join_by(genome == genome)) %>%
  group_by(sample,phylum,river, environment) %>%
  summarise(relabun=sum(count))
phylum_summary %>%
    group_by(phylum) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              High_mean=mean(relabun[environment=="high"]*100, na.rm=T),
             High_sd=sd(relabun[environment=="high"]*100, na.rm=T),
              Low_mean=mean(relabun[environment=="low"]*100, na.rm=T),
              Low_sd=sd(relabun[environment=="low"]*100, na.rm=T)) %>%
    mutate(Total=str_c(round(total_mean,3),"±",round(total_sd,3)),
           High=str_c(round(High_mean,3),"±",round(High_sd,3)),
           Low=str_c(round(Low_mean,3),"±",round(Low_sd,3))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(phylum,Total,High,Low)
# A tibble: 13 × 4
   phylum               Total         High          Low          
   <chr>                <chr>         <chr>         <chr>        
 1 p__Bacteroidota      56.017±16.856 57.802±15.145 54.344±18.655
 2 p__Bacillota_A       17.723±6.394  16.389±6.41   18.973±6.322 
 3 p__Pseudomonadota    11.568±13.498 8.774±7.907   14.187±17.056
 4 p__Bacillota         5.475±8.813   6.263±12.015  4.736±4.405  
 5 p__Verrucomicrobiota 4.116±4.507   4.218±4.909   4.02±4.256   
 6 p__Desulfobacterota  1.795±1.741   1.747±1.616   1.839±1.902  
 7 p__Fusobacteriota    1.369±2.181   2.692±2.505   0.128±0.513  
 8 p__Bacillota_C       0.653±0.926   0.426±0.37    0.865±1.22   
 9 p__Deferribacterota  0.602±0.769   0.923±0.952   0.3±0.369    
10 p__Cyanobacteriota   0.37±0.498    0.36±0.511    0.379±0.503  
11 p__Bacillota_B       0.155±0.141   0.191±0.143   0.12±0.134   
12 p__Elusimicrobiota   0.126±0.428   0.199±0.59    0.058±0.175  
13 p__Chlamydiota       0.033±0.084   0.015±0.039   0.049±0.11   
phylum_arrange <- phylum_summary %>%
    group_by(phylum) %>%
    summarise(mean=sum(relabun)) %>%
    arrange(-mean) %>%
    select(phylum) %>%
    pull()

phylum_summary %>%
    left_join(genome_metadata %>% select(phylum,phylum) %>% unique(),by=join_by(phylum==phylum)) %>%
#    left_join(sample_metadata,by=join_by(sample==sample)) %>%
    filter(phylum %in% phylum_arrange[1:20]) %>%
    mutate(phylum=factor(phylum,levels=rev(phylum_arrange[1:20]))) %>%
    filter(relabun > 0) %>%
    ggplot(aes(x=relabun, y=phylum, group=phylum, color=phylum)) +
        scale_color_manual(values=phylum_colors[-8]) +
        geom_jitter(alpha=0.5) + 
        facet_grid(.~environment)+
        theme_minimal() + 
        labs(y="phylum", x="Relative abundance", color="Phylum")

Bacteria phyla in individuals from low altitude

13

Bacteria phyla in individuals from high altitude

13
phylum_summary %>%
    group_by(phylum) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              le_mean=mean(relabun[river=="Leitzaran"]*100, na.rm=T),
             le_sd=sd(relabun[river=="Leitzaran"]*100, na.rm=T),
              ha_mean=mean(relabun[river=="Harpea"]*100, na.rm=T),
              ha_sd=sd(relabun[river=="Harpea"]*100, na.rm=T),
              er_mean=mean(relabun[river=="Erlan"]*100, na.rm=T),
              er_sd=sd(relabun[river=="Erlan"]*100, na.rm=T),
              go_mean=mean(relabun[river=="Goizueta"]*100, na.rm=T),
              go_sd=sd(relabun[river=="Goizueta"]*100, na.rm=T)) %>%
    mutate(Total=str_c(round(total_mean,3),"±",round(total_sd,3)),
           Leitzaran=str_c(round(le_mean,3),"±",round(le_sd,3)),
           Harpea=str_c(round(ha_mean,3),"±",round(ha_sd,3)),
           Erlan=str_c(round(er_mean,3),"±",round(er_sd,3)),
           Goizueta=str_c(round(go_mean,3),"±",round(go_sd,3))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(phylum,Total,Leitzaran,Goizueta,Harpea,Erlan)
# A tibble: 13 × 6
   phylum               Total         Leitzaran     Goizueta      Harpea        Erlan       
   <chr>                <chr>         <chr>         <chr>         <chr>         <chr>       
 1 p__Bacteroidota      56.017±16.856 60.117±14.174 49.855±21.212 56.148±20.609 59.249±9.464
 2 p__Bacillota_A       17.723±6.394  19.382±7.428  18.654±5.77   16.773±5.991  16.053±7.151
 3 p__Pseudomonadota    11.568±13.498 12.81±9.399   15.259±21.823 7.394±8.916   9.981±7.303 
 4 p__Bacillota         5.475±8.813   3.368±2.906   5.8±5.209     10.8±16.871   2.293±2.479 
 5 p__Verrucomicrobiota 4.116±4.507   2.429±3.463   5.257±4.586   1.69±1.837    6.431±5.773 
 6 p__Desulfobacterota  1.795±1.741   0.907±1.033   2.565±2.152   1.919±2.224   1.597±0.966 
 7 p__Fusobacteriota    1.369±2.181   0.293±0.775   0±0           3.777±2.857   1.743±1.828 
 8 p__Bacillota_C       0.653±0.926   0.278±0.324   1.322±1.475   0.321±0.194   0.519±0.469 
 9 p__Deferribacterota  0.602±0.769   0.175±0.305   0.397±0.402   0.684±0.385   1.132±1.257 
10 p__Cyanobacteriota   0.37±0.498    0.095±0.215   0.601±0.559   0.264±0.461   0.444±0.569 
11 p__Bacillota_B       0.155±0.141   0.146±0.154   0.1±0.121     0.199±0.128   0.184±0.162 
12 p__Elusimicrobiota   0.126±0.428   0±0           0.102±0.228   0±0           0.374±0.789 
13 p__Chlamydiota       0.033±0.084   0±0           0.088±0.138   0.031±0.054   0±0         

5.1.1.2 Family level

Percentange of families in each group

family_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS nornalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(sample_metadata, by = join_by(sample == sample)) %>% #append sample metadata
  left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
  group_by(sample,family, river,environment) %>%
  summarise(relabun=sum(count))
family_summary %>%
    group_by(family) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              High_mean=mean(relabun[environment=="high"]*100, na.rm=T),
             High_sd=sd(relabun[environment=="high"]*100, na.rm=T),
              Low_mean=mean(relabun[environment=="low"]*100, na.rm=T),
              Low_sd=sd(relabun[environment=="low"]*100, na.rm=T)) %>%
    mutate(Total=str_c(round(total_mean,3),"±",round(total_sd,3)),
           High=str_c(round(High_mean,3),"±",round(High_sd,3)),
           Low=str_c(round(Low_mean,3),"±",round(Low_sd,3))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(family,Total,High,Low)
# A tibble: 59 × 4
   family                Total         High         Low         
   <chr>                 <chr>         <chr>        <chr>       
 1 f__Bacteroidaceae     27.472±14.072 26.66±11.936 28.232±16.18
 2 f__Rikenellaceae      13.92±7.397   18.443±4.375 9.679±7.206 
 3 f__Tannerellaceae     7.721±4.561   5.819±2.671  9.504±5.286 
 4 f__Ruminococcaceae    5.794±4.189   6.941±3.771  4.718±4.391 
 5 f__Lachnospiraceae    4.729±3.338   3.083±2.906  6.272±3.025 
 6 f__Enterobacteriaceae 4.336±9.482   2.673±4.547  5.896±12.456
 7 f__Akkermansiaceae    3.968±4.368   4.044±4.731  3.897±4.155 
 8 f__Marinifilaceae     3.739±3.555   2.958±1.959  4.471±4.529 
 9 f__Aeromonadaceae     3.216±5.116   1.775±2.855  4.567±6.381 
10 f__Mycoplasmoidaceae  3.122±8.895   4.384±12.189 1.939±4.062 
# ℹ 49 more rows
family_arrange <- family_summary %>%
    group_by(family) %>%
    summarise(mean=sum(relabun)) %>%
    arrange(-mean) %>%
    select(family) %>%
    pull()

# Per environment
family_summary %>%
    left_join(genome_metadata %>% select(family,phylum) %>% unique(),by=join_by(family==family)) %>%
    filter(family %in% family_arrange[1:20]) %>%
    mutate(family=factor(family,levels=rev(family_arrange[1:20]))) %>%
    filter(relabun > 0) %>%
    ggplot(aes(x=relabun, y=family, group=family, color=phylum)) +
        scale_color_manual(values=phylum_colors[-8]) +
        geom_jitter(alpha=0.5) + 
        facet_grid(.~environment)+
        theme_minimal() + 
        labs(y="Family", x="Relative abundance", color="Phylum")

family_summary %>%
    group_by(family) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              le_mean=mean(relabun[river=="Leitzaran"]*100, na.rm=T),
             le_sd=sd(relabun[river=="Leitzaran"]*100, na.rm=T),
              ha_mean=mean(relabun[river=="Harpea"]*100, na.rm=T),
              ha_sd=sd(relabun[river=="Harpea"]*100, na.rm=T),
              er_mean=mean(relabun[river=="Erlan"]*100, na.rm=T),
              er_sd=sd(relabun[river=="Erlan"]*100, na.rm=T),
              go_mean=mean(relabun[river=="Goizueta"]*100, na.rm=T),
              go_sd=sd(relabun[river=="Goizueta"]*100, na.rm=T)) %>%
    mutate(Total=str_c(round(total_mean,3),"±",round(total_sd,3)),
           Leitzaran=str_c(round(le_mean,3),"±",round(le_sd,3)),
           Harpea=str_c(round(ha_mean,3),"±",round(ha_sd,3)),
           Erlan=str_c(round(er_mean,3),"±",round(er_sd,3)),
           Goizueta=str_c(round(go_mean,3),"±",round(go_sd,3))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(family,Total,Leitzaran,Goizueta,Harpea,Erlan)
# A tibble: 59 × 6
   family                Total         Leitzaran     Goizueta      Harpea       Erlan        
   <chr>                 <chr>         <chr>         <chr>         <chr>        <chr>        
 1 f__Bacteroidaceae     27.472±14.072 34.786±20.422 23.135±10.548 27.17±12.675 26.215±12.113
 2 f__Rikenellaceae      13.92±7.397   8.911±9.05    10.277±5.916  17.11±5.122  19.609±3.531 
 3 f__Tannerellaceae     7.721±4.561   9.923±5.108   9.177±5.705   5.979±2.836  5.679±2.706  
 4 f__Ruminococcaceae    5.794±4.189   4.235±4.644   5.094±4.428   6.708±3.914  7.146±3.9    
 5 f__Lachnospiraceae    4.729±3.338   5.815±2.118   6.629±3.671   2.699±1.253  3.419±3.907  
 6 f__Enterobacteriaceae 4.336±9.482   4.222±2.926   7.198±16.738  2.824±5.913  2.54±3.368   
 7 f__Akkermansiaceae    3.968±4.368   2.429±3.463   5.04±4.475    1.69±1.837   6.104±5.61   
 8 f__Marinifilaceae     3.739±3.555   4.765±6.475   4.243±2.624   2.306±1.732  3.528±2.075  
 9 f__Aeromonadaceae     3.216±5.116   6.023±7.391   3.435±5.664   0.515±0.676  2.878±3.597  
10 f__Mycoplasmoidaceae  3.122±8.895   1.267±2.044   2.462±5.207   8.878±17.359 0.452±0.999  
# ℹ 49 more rows

5.1.1.3 Genus level

*** Percetange of genera in each group***

genus_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS nornalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(sample_metadata, by = join_by(sample == sample)) %>% #append sample metadata
  left_join(genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
  group_by(sample,phylum,genus, environment) %>%
  summarise(relabun=sum(count)) 
# %>%
#   filter(genus != "g__") %>%
#   mutate(genus= sub("^g__", "", genus))

genus_summary %>%
    group_by(genus) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              High_mean=mean(relabun[environment=="high"]*100, na.rm=T),
             High_sd=sd(relabun[environment=="high"]*100, na.rm=T),
              Low_mean=mean(relabun[environment=="low"]*100, na.rm=T),
              Low_sd=sd(relabun[environment=="low"]*100, na.rm=T)) %>%
    mutate(Total=str_c(round(total_mean,3),"±",round(total_sd,3)),
           High=str_c(round(High_mean,3),"±",round(High_sd,3)),
           Low=str_c(round(Low_mean,3),"±",round(Low_sd,3))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(genus,Total,High,Low) 
# A tibble: 112 × 4
   genus              Total         High         Low         
   <chr>              <chr>         <chr>        <chr>       
 1 g__Bacteroides     26.763±13.992 26.077±11.54 27.407±16.32
 2 g__Parabacteroides 6.376±3.96    4.975±2.577  7.69±4.622  
 3 g__Mucinivorans    6.085±4.345   8.178±3.614  4.123±4.132 
 4 g__Aeromonas       3.216±5.116   1.775±2.855  4.567±6.381 
 5 g__Mycoplasma_L    2.603±8.972   4.309±12.215 1.003±4.012 
 6 g__Akkermansia     2.441±3.614   2.187±4.075  2.678±3.241 
 7 g__Odoribacter     2.389±2.236   1.859±1.22   2.886±2.84  
 8 g__JADFUS01        2.217±1.317   2.721±1.198  1.744±1.279 
 9 g__Hafnia          1.923±9.189   0.09±0.147   3.641±12.741
10 g__UBA866          1.817±2.179   1.759±1.596  1.871±2.668 
# ℹ 102 more rows
genus_summary_sort <- genus_summary %>%
    group_by(genus) %>%
    summarise(mean=mean(relabun, na.rm=T),sd=sd(relabun, na.rm=T)) %>%
    arrange(-mean) 

genus_summary %>%
  mutate(genus=factor(genus, levels=rev(genus_summary_sort %>% pull(genus)))) %>%
  filter(relabun > 0) %>%
  ggplot(aes(x=relabun, y=genus, group=genus, color=phylum)) +
  scale_color_manual(values=phylum_colors) +
  geom_jitter(alpha=0.5) + 
  facet_grid(.~environment)+
  theme_minimal() + 
  theme(axis.text.y = element_text(size=6))+
  labs(y="Family", x="Relative abundance", color="Phylum")

Number of MAGs without genera-level annotation

133
phylum count_nogene count_total percentage
p__Bacillota 4 23 17.39130
p__Bacillota_A 45 175 25.71429
p__Bacillota_B 3 4 75.00000
p__Bacillota_C 11 11 100.00000
p__Bacteroidota 24 191 12.56545
p__Chlamydiota 1 1 100.00000
p__Cyanobacteriota 7 12 58.33333
p__Deferribacterota 2 2 100.00000
p__Desulfobacterota 4 7 57.14286
p__Elusimicrobiota 1 2 50.00000
p__Pseudomonadota 18 87 20.68966
p__Verrucomicrobiota 13 18 72.22222

Percentage of MAGs without genus-level annotation

24.67532

Number of MAGs without species-level annotation

# A tibble: 1 × 1
  Mag_nospecies
          <int>
1           494
phylum count_nospecies count_total species_annotated percentage percentage_species
p__Bacillota 22 23 1 95.65217 4.347826
p__Bacillota_A 175 175 0 100.00000 0.000000
p__Bacillota_B 4 4 0 100.00000 0.000000
p__Bacillota_C 11 11 0 100.00000 0.000000
p__Bacteroidota 191 191 0 100.00000 0.000000
p__Chlamydiota 1 1 0 100.00000 0.000000
p__Cyanobacteriota 12 12 0 100.00000 0.000000
p__Deferribacterota 2 2 0 100.00000 0.000000
p__Desulfobacterota 7 7 0 100.00000 0.000000
p__Elusimicrobiota 2 2 0 100.00000 0.000000
p__Fusobacteriota 4 6 2 66.66667 33.333333
p__Pseudomonadota 45 87 42 51.72414 48.275862
p__Verrucomicrobiota 18 18 0 100.00000 0.000000

Percentage of MAGs without species-level annotation

91.65121

5.2 Skin microbiota

load("data/skin/data.Rdata")

5.2.1 Taxonomy overview

5.2.1.1 Phylum level

genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS nornalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
  left_join(., sample_metadata, by = join_by(sample == sample)) %>% #append sample metadata
  filter(count > 0) %>% #filter 0 counts
  ggplot(., aes(x=sample,y=count, fill=phylum, group=phylum)) + #grouping enables keeping the same sorting of taxonomic units
    geom_bar(stat="identity", colour="white", linewidth=0.1) + #plot stacked bars with white borders
    scale_fill_manual(values=phylum_colors) +
    facet_nested(~factor(environment, labels=c("low" = "Low altitude", "high" = "High altitude")),  scales="free") + #facet per day and treatment
    guides(fill = guide_legend(ncol = 1)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          axis.ticks.x = element_blank(),
          axis.title.x = element_blank(),
          panel.background = element_blank(),
          panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          strip.background = element_rect(fill = "white"),
          strip.text = element_text(size = 12, lineheight = 0.6,face="bold"),
          axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black")) +
   labs(fill="Phylum",y = "Relative abundance",x="Samples")

Number of MAGs

43

Number of bacteria phyla

3

Number of Archaea phyla

0

Phylum relative abundances

phylum_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>%
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>%
  left_join(sample_metadata, by = join_by(sample == sample)) %>%
  left_join(genome_metadata, by = join_by(genome == genome)) %>%
  group_by(sample,phylum, environment, river) %>%
  summarise(relabun=sum(count))
phylum_summary %>%
    group_by(phylum) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              High_mean=mean(relabun[environment=="high"]*100, na.rm=T),
             High_sd=sd(relabun[environment=="high"]*100, na.rm=T),
              Low_mean=mean(relabun[environment=="low"]*100, na.rm=T),
              Low_sd=sd(relabun[environment=="low"]*100, na.rm=T)) %>%
    mutate(Total=str_c(round(total_mean,3),"±",round(total_sd,3)),
           High=str_c(round(High_mean,3),"±",round(High_sd,3)),
           Low=str_c(round(Low_mean,6),"±",round(Low_sd,3))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(phylum,Total,High,Low)
# A tibble: 3 × 4
  phylum            Total         High         Low            
  <chr>             <chr>         <chr>        <chr>          
1 p__Pseudomonadota 84.888±15.792 88.597±8.902 82.415673±18.92
2 p__Bacteroidota   14.869±15.806 10.797±8.675 17.584327±18.92
3 p__               0.242±1.198   0.606±1.882  0±0            
phylum_arrange <- phylum_summary %>%
    group_by(phylum) %>%
    summarise(mean=sum(relabun)) %>%
    arrange(-mean) %>%
    select(phylum) %>%
    pull()

phylum_summary %>%
    left_join(genome_metadata %>% select(phylum,phylum) %>% unique(),by=join_by(phylum==phylum)) %>%
#    left_join(sample_metadata,by=join_by(sample==sample)) %>%
    filter(phylum %in% phylum_arrange[1:20]) %>%
    mutate(phylum=factor(phylum,levels=rev(phylum_arrange[1:20]))) %>%
    filter(relabun > 0) %>%
    ggplot(aes(x=relabun, y=phylum, group=phylum, color=phylum)) +
        scale_color_manual(values=phylum_colors[-8]) +
        geom_jitter(alpha=0.5) + 
        facet_grid(.~environment)+
        theme_minimal() + 
        labs(y="phylum", x="Relative abundance", color="Phylum")

Bacteria phyla in individuals from low altitude

[1] 2

Bacteria phyla in individuals from high altitude

[1] 3
phylum_summary %>%
    group_by(phylum) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              le_mean=mean(relabun[river=="Leitzaran"]*100, na.rm=T),
             le_sd=sd(relabun[river=="Leitzaran"]*100, na.rm=T),
              ha_mean=mean(relabun[river=="Harpea"]*100, na.rm=T),
              ha_sd=sd(relabun[river=="Harpea"]*100, na.rm=T),
              er_mean=mean(relabun[river=="Erlan"]*100, na.rm=T),
              er_sd=sd(relabun[river=="Erlan"]*100, na.rm=T),
              go_mean=mean(relabun[river=="Goizueta"]*100, na.rm=T),
              go_sd=sd(relabun[river=="Goizueta"]*100, na.rm=T)) %>%
    mutate(Total=str_c(round(total_mean,3),"±",round(total_sd,3)),
           Leitzaran=str_c(round(le_mean,6),"±",round(le_sd,3)),
           Harpea=str_c(round(ha_mean,6),"±",round(ha_sd,3)),
           Erlan=str_c(round(er_mean,6),"±",round(er_sd,3)),
           Goizueta=str_c(round(go_mean,6),"±",round(go_sd,3))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(phylum,Total,Leitzaran,Goizueta,Harpea,Erlan)
# A tibble: 3 × 6
  phylum            Total         Leitzaran       Goizueta         Harpea          Erlan           
  <chr>             <chr>         <chr>           <chr>            <chr>           <chr>           
1 p__Pseudomonadota 84.888±15.792 79.00655±24.425 85.824795±11.744 92.755803±3.456 86.517677±10.227
2 p__Bacteroidota   14.869±15.806 20.99345±24.425 14.175205±11.744 7.244197±3.456  12.573272±10.116
3 p__               0.242±1.198   0±0             0±0              0±0             0.909051±2.292  

5.2.1.2 Family level

Percentange of families in each group

family_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS nornalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(sample_metadata, by = join_by(sample == sample)) %>% #append sample metadata
  left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
  group_by(sample,family, river,environment) %>%
  summarise(relabun=sum(count))
family_summary %>%
    group_by(family) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              High_mean=mean(relabun[environment=="high"]*100, na.rm=T),
             High_sd=sd(relabun[environment=="high"]*100, na.rm=T),
              Low_mean=mean(relabun[environment=="low"]*100, na.rm=T),
              Low_sd=sd(relabun[environment=="low"]*100, na.rm=T)) %>%
    mutate(Total=str_c(round(total_mean,3),"±",round(total_sd,3)),
           High=str_c(round(High_mean,3),"±",round(High_sd,3)),
           Low=str_c(round(Low_mean,3),"±",round(Low_sd,3))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(family,Total,High,Low)
# A tibble: 14 × 4
   family                Total         High          Low          
   <chr>                 <chr>         <chr>         <chr>        
 1 f__Methylophilaceae   29.667±26.384 48.878±23.032 16.86±20.25  
 2 f__Burkholderiaceae_B 27.235±24.427 18.967±12.87  32.747±28.814
 3 f__Pseudomonadaceae   17.505±22.204 12.039±14.84  21.149±25.756
 4 f__Spirosomaceae      13.217±15.853 7.659±6.105   16.923±19.189
 5 f__Moraxellaceae      3.763±9.946   5.429±14.848  2.652±4.779  
 6 f__Burkholderiaceae   2.468±5.746   0.498±1.071   3.781±7.142  
 7 f__Burkholderiaceae_A 1.898±6.249   0.153±0.531   3.061±7.928  
 8 f__Sphingomonadaceae  1.403±4.051   1.256±1.956   1.5±5.049    
 9 f__Flavobacteriaceae  1.282±4.516   3.138±6.889   0.044±0.188  
10 f__Alteromonadaceae   0.671±1.439   0.981±1.532   0.465±1.378  
11 f__Weeksellaceae      0.37±2.028    0±0           0.617±2.618  
12 f__                   0.242±1.198   0.606±1.882   0±0          
13 f__UBA6184            0.158±0.866   0.395±1.37    0±0          
14 f__Rhodocyclaceae     0.12±0.462    0±0           0.2±0.59     
family_arrange <- family_summary %>%
    group_by(family) %>%
    summarise(mean=sum(relabun)) %>%
    arrange(-mean) %>%
    select(family) %>%
    pull()

# Per environment
family_summary %>%
    left_join(genome_metadata %>% select(family,phylum) %>% unique(),by=join_by(family==family)) %>%
    filter(family %in% family_arrange[1:20]) %>%
    mutate(family=factor(family,levels=rev(family_arrange[1:20]))) %>%
    filter(relabun > 0) %>%
    ggplot(aes(x=relabun, y=family, group=family, color=phylum)) +
        scale_color_manual(values=phylum_colors[-8]) +
        geom_jitter(alpha=0.5) + 
        facet_grid(.~environment)+
        theme_minimal() + 
        labs(y="Family", x="Relative abundance", color="Phylum")

family_summary %>%
    group_by(family) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              le_mean=mean(relabun[river=="Leitzaran"]*100, na.rm=T),
             le_sd=sd(relabun[river=="Leitzaran"]*100, na.rm=T),
              ha_mean=mean(relabun[river=="Harpea"]*100, na.rm=T),
              ha_sd=sd(relabun[river=="Harpea"]*100, na.rm=T),
              er_mean=mean(relabun[river=="Erlan"]*100, na.rm=T),
              er_sd=sd(relabun[river=="Erlan"]*100, na.rm=T),
              go_mean=mean(relabun[river=="Goizueta"]*100, na.rm=T),
              go_sd=sd(relabun[river=="Goizueta"]*100, na.rm=T)) %>%
    mutate(Total=str_c(round(total_mean,3),"±",round(total_sd,3)),
           Leitzaran=str_c(round(le_mean,3),"±",round(le_sd,3)),
           Harpea=str_c(round(ha_mean,3),"±",round(ha_sd,3)),
           Erlan=str_c(round(er_mean,3),"±",round(er_sd,3)),
           Goizueta=str_c(round(go_mean,3),"±",round(go_sd,3))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(family,Total,Leitzaran,Goizueta,Harpea,Erlan)
# A tibble: 14 × 6
   family                Total         Leitzaran     Goizueta      Harpea        Erlan        
   <chr>                 <chr>         <chr>         <chr>         <chr>         <chr>        
 1 f__Methylophilaceae   29.667±26.384 12.119±9.771  21.601±26.932 38.61±29.001  54.012±19.565
 2 f__Burkholderiaceae_B 27.235±24.427 43.95±28.971  21.545±25.349 12.11±8.23    22.396±13.818
 3 f__Pseudomonadaceae   17.505±22.204 17.201±21.401 25.097±30.276 24.43±19.778  5.843±6.843  
 4 f__Spirosomaceae      13.217±15.853 20.993±24.425 12.853±12.192 6.742±4.094   8.117±7.118  
 5 f__Moraxellaceae      3.763±9.946   2.005±4.134   3.298±5.524   14.552±25.274 0.868±1.166  
 6 f__Burkholderiaceae   2.468±5.746   0.238±0.715   7.324±8.924   0.795±1.59    0.349±0.803  
 7 f__Burkholderiaceae_A 1.898±6.249   2.562±6.41    3.559±9.588   0±0           0.23±0.65    
 8 f__Sphingomonadaceae  1.403±4.051   0±0           3.001±7.007   0±0           1.884±2.159  
 9 f__Flavobacteriaceae  1.282±4.516   0±0           0.088±0.265   0.503±1.005   4.456±8.258  
10 f__Alteromonadaceae   0.671±1.439   0.93±1.884    0±0           2.259±2.212   0.342±0.439  
11 f__Weeksellaceae      0.37±2.028    0±0           1.234±3.703   0±0           0±0          
12 f__                   0.242±1.198   0±0           0±0           0±0           0.909±2.292  
13 f__UBA6184            0.158±0.866   0±0           0±0           0±0           0.593±1.678  
14 f__Rhodocyclaceae     0.12±0.462    0±0           0.4±0.806     0±0           0±0          

5.2.1.3 Genus level

genus_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS nornalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(sample_metadata, by = join_by(sample == sample)) %>% #append sample metadata
  left_join(genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
  group_by(sample,phylum,genus, environment) %>%
  summarise(relabun=sum(count)) 
# %>%
#   filter(genus != "g__") %>%
#   mutate(genus= sub("^g__", "", genus))

genus_summary %>%
    group_by(genus) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              High_mean=mean(relabun[environment=="high"]*100, na.rm=T),
             High_sd=sd(relabun[environment=="high"]*100, na.rm=T),
              Low_mean=mean(relabun[environment=="low"]*100, na.rm=T),
              Low_sd=sd(relabun[environment=="low"]*100, na.rm=T)) %>%
    mutate(Total=str_c(round(total_mean,3),"±",round(total_sd,3)),
           High=str_c(round(High_mean,3),"±",round(High_sd,3)),
           Low=str_c(round(Low_mean,3),"±",round(Low_sd,3))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(genus,Total,High,Low) %>% 
  tt()
genus Total High Low
g__Methylotenera 17.548±22.587 37.152±23.413 4.479±7.969
g__Pseudomonas_E 17.505±22.204 12.039±14.84 21.149±25.756
g__Rhodoferax_C 15.176±22.446 6.546±5.573 20.929±27.417
g__Methylophilus 10.821±17.174 10.294±8.855 11.172±21.263
g__Arcicella 7.66±9.217 6.484±6.555 8.443±10.746
g__JALRIJ01 5.558±12.997 1.174±2.749 8.48±16.145
g__Ideonella 4.853±8.717 8.171±10.782 2.641±6.439
g__Acinetobacter 3.763±9.946 5.429±14.848 2.652±4.779
g__Paucibacter_A 3.522±7.505 2.982±2.89 3.881±9.504
g__JADJBS01 2.974±12.106 0±0 4.956±15.479
g__JAEZVV01 1.898±6.249 0.153±0.531 3.061±7.928
g__Telluria 1.495±5.285 0.233±0.663 2.336±6.745
g__Flavobacterium 1.282±4.516 3.138±6.889 0.044±0.188
g__Sphingobium 1.066±3.922 0.415±0.626 1.5±5.049
g__Undibacterium 0.973±2.645 0.265±0.918 1.445±3.287
g__ 0.77±1.988 1.019±2.208 0.605±1.841
g__Pararheinheimera 0.671±1.439 0.981±1.532 0.465±1.378
g__JAAFJR01 0.377±1.888 0.943±2.969 0±0
g__Chryseobacterium 0.37±2.028 0±0 0.617±2.618
g__Novosphingobium 0.336±1.068 0.841±1.594 0±0
g__Pseudorhodoferax 0.334±0.916 0.325±0.995 0.34±0.889
g__UBA6184 0.158±0.866 0.395±1.37 0±0
g__Zoogloea 0.12±0.462 0±0 0.2±0.59
genus_summary_sort <- genus_summary %>%
    group_by(genus) %>%
    summarise(mean=mean(relabun, na.rm=T),sd=sd(relabun, na.rm=T)) %>%
    arrange(-mean) 

genus_summary %>%
  mutate(genus=factor(genus, levels=rev(genus_summary_sort %>% pull(genus)))) %>%
  filter(relabun > 0) %>%
  ggplot(aes(x=relabun, y=genus, group=genus, color=phylum)) +
  scale_color_manual(values=phylum_colors) +
  geom_jitter(alpha=0.5) + 
  facet_grid(.~environment)+
  theme_minimal() + 
  theme(axis.text.y = element_text(size=6))+
  labs(y="Family", x="Relative abundance", color="Phylum")

Number of MAGs without genera-level annotation

5
phylum count_nogene count_total percentage
p__ 2 2 100.000000
p__Pseudomonadota 3 35 8.571429

Percentage of MAGs without genus-level annotation

11.62791

Number of MAGs without species-level annotation

# A tibble: 1 × 1
  Mag_nospecies
          <int>
1            33
phylum count_nospecies count_total species_annotated percentage_non_species percentage_species
p__ 2 2 0 100.00000 0.00000
p__Bacteroidota 5 6 1 83.33333 16.66667
p__Pseudomonadota 26 35 9 74.28571 25.71429

Percentage of MAGs without species-level annotation

76.74419